// Documentation: https://github.com/hooke007/MPV_lazy/wiki/4_GLSL

// CuNNy faster SOFT
// Copyright (c) 2024 funnyplanter

// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 3.0 of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program.  If not, see <https://www.gnu.org/licenses/>.
/* ------------------------------------------------------------------- */


//!DESC [CuNNy_faster_SOFT] -in
//!HOOK LUMA
//!COMPUTE 16 8 8 8
//!BIND LUMA
//!SAVE in
//!WIDTH LUMA.w 2 *
//!HEIGHT LUMA.h
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass "in": first layer of the network. It reads the LUMA plane and saves
// its result as the texture "in", which is twice as wide as LUMA and has
// 4 components per texel (hook() below writes two texels per LUMA pixel).
// The WHEN expression is written in reverse Polish notation:
// (OUTPUT.w > LUMA.w * 1.2) * (OUTPUT.h > LUMA.h * 1.2), so presumably the
// pass only runs when the output is more than 1.2x larger than LUMA in both
// dimensions (confirm against the mpv user-shader documentation).
// NOTE(review): the host's shader parser appears to treat the first line
// that is not a directive as the start of the pass body, so explanatory
// comments are kept below the directive header - confirm before moving them.

// Request 16-bit float types. When the extension macro is not defined the
// V4 / M4 / F aliases fall back to the 32-bit types instead.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif
// l0(x, y): red channel of LUMA_raw (scaled by LUMA_mul) at pos + (x, y),
// with the coordinate clamped to [0, sz], converted to F. The macro expands
// the names `pos` and `sz`, so both must be in scope where it is used
// (they are locals of hook()). "* ivec2(1, 1) + ivec2(0, 0)" is an identity.
#define l0(x, y) F((LUMA_mul * texelFetch(LUMA_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(1, 1) + ivec2(0, 0), 0)).r)
// Workgroup tile of input samples: hook() fills all 10x10 entries from its
// 8x8 thread group, i.e. the 8x8 block plus a one-texel border on each side
// as needed by the 3x3 taps.
shared F G[1][10][10];
// Pass "in" entry point: 3x3 convolution over the luma tile that yields two
// 4-component results per source pixel, stored side by side in out_image.
void hook() {
	ivec2 lid = ivec2(gl_LocalInvocationID.xy);
	// `pos` and `sz` keep their names because the l0() macro expands them.
	ivec2 pos = ivec2(gl_WorkGroupID.xy) * ivec2(8, 8) + lid;
	ivec2 sz = ivec2(LUMA_size) - ivec2(1);
	ivec2 dst = pos * ivec2(2, 1);
	// Cooperative load of the 10x10 tile: every thread fetches its own texel
	// and, where the shifted index still lands inside the tile, the texels
	// 8 further along each axis. Tile entry [row][col] holds the sample at
	// offset (col - 1, row - 1) from the workgroup origin.
	for (int dy = 0; dy < 10; dy += 8) {
		int row = lid.y + dy;
		if (row < 10) {
			for (int dx = 0; dx < 10; dx += 8) {
				int col = lid.x + dx;
				if (col < 10) {
					G[0][row][col] = l0(dx - 1, dy - 1);
				}
			}
		}
	}
	barrier();
	// 3x3 neighbourhood of this thread, named p<row><col>.
	F p00 = G[0][lid.y][lid.x];
	F p01 = G[0][lid.y][lid.x + 1];
	F p02 = G[0][lid.y][lid.x + 2];
	F p10 = G[0][lid.y + 1][lid.x];
	F p11 = G[0][lid.y + 1][lid.x + 1];
	F p12 = G[0][lid.y + 1][lid.x + 2];
	F p20 = G[0][lid.y + 2][lid.x];
	F p21 = G[0][lid.y + 2][lid.x + 1];
	F p22 = G[0][lid.y + 2][lid.x + 2];
	V4 acc0 = V4(0.0);
	V4 acc1 = V4(0.0);
	// Per-tap weights; the accumulation order is kept as generated.
	acc0 += V4(-6.186e-02, 2.542e-02, -3.126e-02, 4.276e-01) * p00;
	acc1 += V4(1.024e-01, 2.983e-01, 1.847e-02, 7.229e-03) * p00;
	acc0 += V4(2.466e-01, -3.970e-02, -3.019e-01, -2.549e-01) * p01;
	acc1 += V4(3.331e-01, -1.843e-01, 4.878e-03, -1.012e+00) * p01;
	acc0 += V4(-1.487e-02, 1.207e-02, 1.938e-02, -1.742e-01) * p02;
	acc1 += V4(2.189e-02, -1.410e-01, -1.995e-02, 3.394e-02) * p02;
	acc0 += V4(-3.043e-02, -5.645e-02, -2.347e-01, 6.738e-01) * p10;
	acc1 += V4(2.826e-01, -6.736e-01, -9.939e-01, -3.110e-03) * p10;
	acc0 += V4(-1.997e-01, 7.020e-01, 1.021e+00, -6.116e-01) * p11;
	acc1 += V4(-1.371e+00, 8.032e-01, 9.512e-01, 1.035e+00) * p11;
	acc0 += V4(1.465e-01, -4.654e-03, -1.250e-01, -9.517e-02) * p12;
	acc1 += V4(1.188e-01, -8.951e-02, 3.938e-02, -5.260e-02) * p12;
	acc0 += V4(-3.857e-02, 3.157e-02, 2.097e-02, -1.312e-01) * p20;
	acc1 += V4(2.049e-02, -3.059e-01, 4.589e-02, -8.052e-03) * p20;
	acc0 += V4(9.864e-02, 9.063e-02, -3.894e-02, 3.186e-03) * p21;
	acc1 += V4(3.989e-02, 3.252e-01, -3.164e-02, -2.699e-02) * p21;
	acc0 += V4(-3.073e-02, -7.637e-01, -1.021e-01, 1.632e-01) * p22;
	acc1 += V4(1.266e-01, -3.480e-02, -1.752e-02, 1.867e-02) * p22;
	// Add the constant term, clamp to [0, 1] and write the left texel.
	acc0 += V4(4.001e-03, 2.329e-06, 2.258e-04, 3.230e-03);
	acc0 = clamp(acc0, V4(0.0), V4(1.0));
	imageStore(out_image, dst, vec4(acc0));
	// Same for the right texel of the pair.
	acc1 += V4(1.177e-03, 2.971e-03, -1.022e-04, 6.960e-04);
	acc1 = clamp(acc1, V4(0.0), V4(1.0));
	imageStore(out_image, dst + ivec2(1, 0), vec4(acc1));
}

//!DESC [CuNNy_faster_SOFT] -conv1
//!HOOK LUMA
//!COMPUTE 16 8 8 8
//!BIND in
//!BIND LUMA
//!SAVE conv1
//!WIDTH LUMA.w 2 *
//!HEIGHT LUMA.h
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass "conv1": reads the texture "in" and saves its result as "conv1",
// again twice as wide as LUMA with 4 components per texel. LUMA is bound as
// well; hook() below uses LUMA_size to derive the clamp limit.
// The WHEN expression is in reverse Polish notation:
// (OUTPUT.w > LUMA.w * 1.2) * (OUTPUT.h > LUMA.h * 1.2).

// Request 16-bit float types. When the extension macro is not defined the
// V4 / M4 / F aliases fall back to the 32-bit types instead.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif
// l0 / l1: fetch the two texels of "in" that belong to the pixel at
// pos + (x, y) (clamped to [0, sz]): texel column 2 * px for l0 and
// 2 * px + 1 for l1, each scaled by in_mul. Both macros expand the names
// `pos` and `sz`, which must be in scope at the point of use.
#define l0(x, y) V4((in_mul * texelFetch(in_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(2, 1) + ivec2(0, 0), 0)))
#define l1(x, y) V4((in_mul * texelFetch(in_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(2, 1) + ivec2(1, 0), 0)))
// Workgroup tile: index 0 holds the l0 texels and index 1 the l1 texels of a
// 10x10 pixel region (8x8 block plus a one-pixel border for the 3x3 taps).
shared V4 G[2][10][10];
// Pass "conv1" entry point: 3x3 convolution over both input texels of each
// pixel, producing two 4-component results clamped to [0, 1].
void hook() {
	ivec2 lid = ivec2(gl_LocalInvocationID.xy);
	// `pos` and `sz` keep their names because l0()/l1() expand them.
	ivec2 pos = ivec2(gl_WorkGroupID.xy) * ivec2(8, 8) + lid;
	ivec2 sz = ivec2(LUMA_size) - ivec2(1);
	ivec2 dst = pos * ivec2(2, 1);
	// Cooperative load of the 10x10 tile (8x8 block plus one-pixel border).
	for (int dy = 0; dy < 10; dy += 8) {
		int row = lid.y + dy;
		if (row < 10) {
			for (int dx = 0; dx < 10; dx += 8) {
				int col = lid.x + dx;
				if (col < 10) {
					G[0][row][col] = l0(dx - 1, dy - 1);
					G[1][row][col] = l1(dx - 1, dy - 1);
				}
			}
		}
	}
	barrier();
	// 3x3 neighbourhood from the first input texel, named a<row><col>.
	V4 a00 = G[0][lid.y][lid.x];
	V4 a01 = G[0][lid.y][lid.x + 1];
	V4 a02 = G[0][lid.y][lid.x + 2];
	V4 a10 = G[0][lid.y + 1][lid.x];
	V4 a11 = G[0][lid.y + 1][lid.x + 1];
	V4 a12 = G[0][lid.y + 1][lid.x + 2];
	V4 a20 = G[0][lid.y + 2][lid.x];
	V4 a21 = G[0][lid.y + 2][lid.x + 1];
	V4 a22 = G[0][lid.y + 2][lid.x + 2];
	// 3x3 neighbourhood from the second input texel, named b<row><col>.
	V4 b00 = G[1][lid.y][lid.x];
	V4 b01 = G[1][lid.y][lid.x + 1];
	V4 b02 = G[1][lid.y][lid.x + 2];
	V4 b10 = G[1][lid.y + 1][lid.x];
	V4 b11 = G[1][lid.y + 1][lid.x + 1];
	V4 b12 = G[1][lid.y + 1][lid.x + 2];
	V4 b20 = G[1][lid.y + 2][lid.x];
	V4 b21 = G[1][lid.y + 2][lid.x + 1];
	V4 b22 = G[1][lid.y + 2][lid.x + 2];
	V4 acc0 = V4(0.0);
	V4 acc1 = V4(0.0);
	// Per-tap weight matrices; the accumulation order is kept as generated.
	acc0 += M4(2.531e-01, -2.668e-01, -4.434e-01, -3.488e-01, -3.167e-01, 2.829e-01, 2.734e-01, -8.829e-02, 3.571e-02, -3.139e-01, -4.832e-01, -1.988e-01, 1.013e-01, -7.879e-02, -1.205e-02, -2.966e-03) * a00;
	acc1 += M4(-1.860e-01, -2.488e-01, 2.754e-01, -9.522e-02, -8.808e-02, -3.456e-01, 3.734e-02, -3.121e-03, -9.968e-02, -8.389e-02, 1.492e-01, -1.967e-01, -3.855e-02, -3.605e-02, 6.806e-02, -6.793e-02) * a00;
	acc0 += M4(1.000e+00, -2.697e-02, -2.954e-01, -9.928e-02, 1.167e-01, 8.036e-02, 2.725e-01, 3.225e-01, 4.269e-01, -2.678e-01, -3.617e-01, -4.284e-02, -1.270e-01, -1.189e-01, 2.960e-02, 5.395e-03) * a01;
	acc1 += M4(-2.674e-01, -1.390e-01, -3.281e-01, -2.881e-01, 5.304e-01, -2.017e-01, 1.386e-01, 1.276e-01, 3.564e-02, 1.365e-01, 1.000e+00, -2.770e-01, -4.805e-02, -8.303e-02, -1.559e-01, 8.275e-02) * a01;
	acc0 += M4(1.387e-01, -1.472e-01, 3.410e-02, 1.541e-01, 1.164e-01, 1.109e-02, 4.897e-02, -1.545e-01, -1.595e-01, -1.333e-01, -5.546e-02, 1.449e-01, 5.512e-02, 3.440e-03, -5.197e-02, -5.259e-02) * a02;
	acc1 += M4(1.162e-01, 2.977e-01, 3.432e-01, -2.183e-01, -8.881e-02, 6.447e-03, 6.431e-02, 8.472e-02, 1.841e-01, 8.025e-02, 3.675e-01, -2.026e-01, -1.547e-01, -2.545e-02, 1.175e-01, 9.394e-03) * a02;
	acc0 += M4(-4.587e-01, -8.142e-01, 1.321e-02, 2.854e-01, -3.730e-01, -2.065e-01, -6.945e-02, -1.123e-01, -1.944e-01, 7.715e-01, 4.823e-01, -1.225e-01, -4.595e-01, -1.769e-01, -1.420e-01, -4.301e-02) * a10;
	acc1 += M4(4.996e-01, -1.184e-01, -2.518e-01, 1.240e-01, -1.653e-01, -1.635e-01, -1.156e-01, 5.168e-01, -2.904e-01, -1.410e-01, -1.034e-01, 2.905e-01, -5.582e-02, -1.641e-02, -2.257e-03, -1.478e-01) * a10;
	acc0 += M4(8.945e-03, 4.407e-01, -5.606e-02, 3.282e-01, -3.302e-01, -4.006e-02, 2.855e-01, 2.246e-01, 1.000e+00, 4.115e-01, -8.375e-01, -6.501e-01, 2.530e-01, -4.878e-02, 3.342e-03, 6.213e-02) * a11;
	acc1 += M4(2.971e-01, -1.556e-01, 4.343e-01, 2.677e-01, -4.879e-01, -2.645e-01, -9.983e-02, -1.427e-02, -1.000e+00, -5.764e-01, -6.766e-01, -5.257e-01, -2.670e-01, 3.785e-01, -2.647e-02, -1.953e-01) * a11;
	acc0 += M4(2.508e-01, 3.018e-02, 2.196e-01, -3.577e-02, -1.009e-01, -1.261e-02, -1.814e-03, -1.987e-01, 2.391e-01, -4.876e-02, 7.285e-01, 1.000e+00, -2.117e-01, -7.766e-02, 2.389e-02, -4.548e-01) * a12;
	acc1 += M4(-7.533e-02, 8.456e-01, 4.277e-01, 5.663e-02, -5.388e-02, -9.114e-02, -9.255e-02, -3.440e-02, 3.913e-01, 6.421e-01, 3.159e-02, -7.582e-02, -1.577e-01, 2.428e-01, -3.961e-02, -1.049e-01) * a12;
	acc0 += M4(-7.553e-01, 2.089e-01, -6.286e-01, -3.363e-02, 5.772e-02, 2.262e-02, 9.164e-03, -3.519e-02, -3.376e-01, -9.360e-02, -3.205e-01, 3.534e-02, -1.834e-01, -1.035e-01, -3.573e-02, 4.470e-02) * a20;
	acc1 += M4(4.981e-01, -2.162e-01, -8.847e-02, -2.245e-01, -9.954e-02, -3.693e-02, 2.656e-02, -1.609e-01, 2.299e-01, 3.232e-01, -8.358e-02, 1.000e+00, 8.003e-02, -3.522e-03, -6.680e-02, 1.963e-01) * a20;
	acc0 += M4(-2.883e-01, 1.610e-01, -2.128e-02, -3.403e-01, 8.860e-02, 6.158e-03, 2.620e-02, 6.692e-02, -6.458e-01, -4.542e-02, 4.432e-01, -1.674e-01, 1.999e-01, 1.015e-01, 3.891e-01, 3.813e-02) * a21;
	acc1 += M4(-8.155e-01, -2.776e-01, -2.048e-01, 4.006e-01, -2.585e-03, -1.585e-01, 1.582e-02, -1.143e-02, 4.515e-01, 2.086e-01, -4.075e-01, 2.312e-01, 2.053e-01, 7.326e-02, -1.460e-01, -6.125e-02) * a21;
	acc0 += M4(-2.352e-01, -8.222e-02, 1.000e+00, 5.267e-02, 7.237e-02, 1.513e-02, -1.180e-01, -1.319e-02, -3.193e-01, -4.027e-02, 5.020e-01, 1.887e-02, 8.735e-02, 2.248e-02, -8.337e-03, -1.118e-01) * a22;
	acc1 += M4(1.127e-01, -3.352e-01, -6.005e-01, -2.480e-01, -3.016e-02, 1.392e-01, 4.873e-02, -1.361e-02, 9.770e-03, -4.032e-01, -2.803e-01, -1.304e-01, 4.222e-02, 1.304e-01, -7.874e-02, 2.961e-02) * a22;
	acc0 += M4(-1.717e-01, 9.015e-02, 1.951e-01, 1.133e-01, 1.623e-01, -6.030e-02, 3.098e-03, -4.084e-02, 2.097e-02, 1.163e-01, 2.155e-01, 8.350e-02, 7.325e-02, 8.615e-02, 1.434e-01, 4.309e-02) * b00;
	acc1 += M4(-3.414e-02, -2.101e-02, -1.556e-01, 7.275e-03, 1.157e-01, 9.838e-02, 2.796e-02, 1.977e-02, 1.020e-02, 1.705e-02, -6.278e-02, 9.780e-02, 4.824e-02, 1.022e-02, -3.255e-02, 8.569e-02) * b00;
	acc0 += M4(-5.752e-01, 7.673e-02, 2.289e-01, -6.075e-03, 5.408e-01, 1.596e-01, 1.153e-02, -3.004e-02, -5.835e-01, 1.883e-01, 5.075e-02, -1.498e-02, -1.815e-02, 8.536e-02, 2.770e-02, -3.721e-02) * b01;
	acc1 += M4(1.018e-01, 1.223e-01, 4.915e-03, 1.792e-01, 2.536e-01, 3.313e-01, 1.017e-01, -1.145e-01, 5.904e-03, -2.676e-01, -3.135e-01, 9.968e-02, -1.411e-01, -1.675e-02, -3.311e-01, 3.405e-02) * b01;
	acc0 += M4(5.881e-02, 1.055e-01, -1.131e-02, -2.213e-01, 2.114e-01, 7.374e-02, 9.498e-03, 1.534e-01, -1.796e-01, 2.486e-02, -5.175e-02, 1.384e-01, -8.095e-03, 2.765e-02, 5.384e-02, -1.006e-02) * b02;
	acc1 += M4(-2.427e-01, -6.800e-02, -3.104e-01, 1.868e-01, 5.677e-02, 1.816e-01, 3.051e-02, -7.078e-02, 1.635e-01, -3.271e-01, -3.693e-01, 1.163e-01, 1.679e-02, -6.450e-02, -9.913e-02, 3.605e-02) * b02;
	acc0 += M4(-2.779e-01, -4.980e-01, -5.183e-01, 1.382e-01, -2.049e-01, 8.618e-02, 1.811e-01, -9.254e-02, 1.654e-01, -1.528e-01, -1.073e-01, 8.763e-02, -1.575e-01, 4.321e-02, -2.500e-03, 2.440e-02) * b10;
	acc1 += M4(2.611e-01, 1.553e-01, 6.232e-02, -5.567e-01, 2.076e-01, 2.361e-01, -4.105e-02, 4.395e-01, -1.621e-01, -8.412e-02, 6.264e-02, -2.607e-01, 1.909e-02, 6.968e-02, 3.849e-02, -4.311e-02) * b10;
	acc0 += M4(-1.000e+00, -3.113e-01, 7.559e-01, 3.673e-01, -6.455e-01, -3.621e-01, 6.043e-02, -5.042e-02, -1.000e+00, -2.153e-01, -7.024e-01, 7.721e-02, -2.525e-01, 2.224e-01, -3.304e-01, 2.518e-02) * b11;
	acc1 += M4(8.053e-01, 4.907e-01, 2.522e-01, 2.032e-01, 1.832e-01, -1.616e-01, -8.360e-03, -3.155e-02, -4.129e-01, -1.871e-01, 1.772e-01, 6.226e-01, -4.019e-01, -3.447e-01, -1.000e+00, 1.624e-02) * b11;
	acc0 += M4(-4.968e-03, 3.226e-02, -6.048e-01, -7.462e-01, -1.524e-01, -3.218e-02, -5.323e-02, 9.426e-02, 2.223e-01, 5.683e-02, -5.840e-01, -4.058e-01, 2.175e-01, 3.490e-02, -7.231e-02, -9.363e-02) * b12;
	acc1 += M4(-2.112e-01, -2.164e-01, 1.032e-01, 8.272e-02, 1.875e-01, -1.249e-01, -1.289e-01, -1.775e-02, -9.285e-02, -4.797e-01, 5.153e-02, 1.228e-01, 9.270e-02, -2.105e-01, 2.417e-01, -3.332e-02) * b12;
	acc0 += M4(1.559e-01, 6.582e-02, 1.862e-02, -1.908e-02, 4.826e-02, 3.162e-02, 1.835e-01, 3.838e-03, 1.224e-01, 1.645e-02, -1.087e-01, 1.745e-02, -3.117e-01, -2.329e-01, 5.174e-02, -1.140e-01) * b20;
	acc1 += M4(-1.155e-01, -3.997e-01, 7.610e-02, -6.082e-01, -1.165e-01, 7.291e-03, -2.308e-02, 1.965e-01, -6.649e-02, -1.257e-01, 5.972e-02, -5.803e-01, -2.302e-01, 2.191e-02, 1.194e-01, -7.160e-01) * b20;
	acc0 += M4(4.385e-01, -4.891e-02, -2.998e-01, 1.548e-01, 1.774e-03, -4.469e-02, -2.459e-02, 1.824e-02, 1.000e+00, 9.917e-02, 1.082e-01, 1.235e-01, -6.440e-01, 1.065e-02, -5.689e-01, 9.328e-02) * b21;
	acc1 += M4(-1.738e-01, -2.261e-01, 2.455e-01, -1.314e-01, 1.664e-01, -7.752e-02, 1.912e-02, -6.457e-02, -4.690e-01, 4.254e-01, 6.063e-02, -1.000e+00, -4.421e-01, 3.165e-01, 9.387e-02, 1.069e-01) * b21;
	acc0 += M4(2.081e-01, 4.184e-02, -2.829e-01, -4.588e-02, -1.070e-02, 5.121e-02, -1.899e-01, 1.614e-02, 2.845e-01, -1.529e-02, -2.913e-01, 2.414e-01, 1.724e-01, -5.550e-03, -1.764e-01, -5.088e-01) * b22;
	acc1 += M4(-1.492e-02, 2.477e-01, 1.978e-01, 1.033e-01, -1.993e-03, -2.744e-01, 2.779e-02, 1.960e-01, 1.137e-01, 3.369e-01, 2.611e-02, -6.703e-02, 4.059e-02, 2.131e-01, 1.551e-01, -2.639e-02) * b22;
	// Clamp to [0, 1] and write the left / right texel of the output pair.
	acc0 = clamp(acc0, V4(0.0), V4(1.0));
	imageStore(out_image, dst, vec4(acc0));
	acc1 = clamp(acc1, V4(0.0), V4(1.0));
	imageStore(out_image, dst + ivec2(1, 0), vec4(acc1));
}

//!DESC [CuNNy_faster_SOFT] -conv2
//!HOOK LUMA
//!COMPUTE 16 8 8 8
//!BIND conv1
//!BIND LUMA
//!SAVE conv2
//!WIDTH LUMA.w 2 *
//!HEIGHT LUMA.h
//!COMPONENTS 4
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass "conv2": reads the texture "conv1" and saves its result as "conv2",
// twice as wide as LUMA with 4 components per texel. LUMA is bound as well;
// hook() below uses LUMA_size to derive the clamp limit.
// The WHEN expression is in reverse Polish notation:
// (OUTPUT.w > LUMA.w * 1.2) * (OUTPUT.h > LUMA.h * 1.2).

// Request 16-bit float types. When the extension macro is not defined the
// V4 / M4 / F aliases fall back to the 32-bit types instead.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif
// l0 / l1: fetch the two texels of "conv1" that belong to the pixel at
// pos + (x, y) (clamped to [0, sz]): texel column 2 * px for l0 and
// 2 * px + 1 for l1, each scaled by conv1_mul. Both macros expand the names
// `pos` and `sz`, which must be in scope at the point of use.
#define l0(x, y) V4((conv1_mul * texelFetch(conv1_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(2, 1) + ivec2(0, 0), 0)))
#define l1(x, y) V4((conv1_mul * texelFetch(conv1_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(2, 1) + ivec2(1, 0), 0)))
// Workgroup tile: index 0 holds the l0 texels and index 1 the l1 texels of a
// 10x10 pixel region (8x8 block plus a one-pixel border for the 3x3 taps).
shared V4 G[2][10][10];
// Pass "conv2" entry point: 3x3 convolution over both input texels of each
// pixel, producing two 4-component results clamped to [0, 1].
void hook() {
	ivec2 lid = ivec2(gl_LocalInvocationID.xy);
	// `pos` and `sz` keep their names because l0()/l1() expand them.
	ivec2 pos = ivec2(gl_WorkGroupID.xy) * ivec2(8, 8) + lid;
	ivec2 sz = ivec2(LUMA_size) - ivec2(1);
	ivec2 dst = pos * ivec2(2, 1);
	// Cooperative load of the 10x10 tile (8x8 block plus one-pixel border).
	for (int dy = 0; dy < 10; dy += 8) {
		int row = lid.y + dy;
		if (row < 10) {
			for (int dx = 0; dx < 10; dx += 8) {
				int col = lid.x + dx;
				if (col < 10) {
					G[0][row][col] = l0(dx - 1, dy - 1);
					G[1][row][col] = l1(dx - 1, dy - 1);
				}
			}
		}
	}
	barrier();
	// 3x3 neighbourhood from the first input texel, named a<row><col>.
	V4 a00 = G[0][lid.y][lid.x];
	V4 a01 = G[0][lid.y][lid.x + 1];
	V4 a02 = G[0][lid.y][lid.x + 2];
	V4 a10 = G[0][lid.y + 1][lid.x];
	V4 a11 = G[0][lid.y + 1][lid.x + 1];
	V4 a12 = G[0][lid.y + 1][lid.x + 2];
	V4 a20 = G[0][lid.y + 2][lid.x];
	V4 a21 = G[0][lid.y + 2][lid.x + 1];
	V4 a22 = G[0][lid.y + 2][lid.x + 2];
	// 3x3 neighbourhood from the second input texel, named b<row><col>.
	V4 b00 = G[1][lid.y][lid.x];
	V4 b01 = G[1][lid.y][lid.x + 1];
	V4 b02 = G[1][lid.y][lid.x + 2];
	V4 b10 = G[1][lid.y + 1][lid.x];
	V4 b11 = G[1][lid.y + 1][lid.x + 1];
	V4 b12 = G[1][lid.y + 1][lid.x + 2];
	V4 b20 = G[1][lid.y + 2][lid.x];
	V4 b21 = G[1][lid.y + 2][lid.x + 1];
	V4 b22 = G[1][lid.y + 2][lid.x + 2];
	V4 acc0 = V4(0.0);
	V4 acc1 = V4(0.0);
	// Per-tap weight matrices; the accumulation order is kept as generated.
	acc0 += M4(1.522e-03, -1.920e-01, -1.638e-02, 7.138e-03, -2.208e-03, 2.931e-02, 2.245e-02, 1.518e-02, -1.233e-02, -8.466e-02, -8.630e-02, -1.476e-02, -9.569e-03, -2.152e-01, -1.948e-02, -1.301e-01) * a00;
	acc1 += M4(-2.903e-02, 7.501e-04, -1.192e-03, 1.136e-02, -6.480e-04, 3.209e-02, -1.238e-02, 9.244e-03, -3.095e-02, -1.199e-01, 6.501e-03, -3.038e-02, 1.659e-02, -1.052e-01, -3.452e-02, -1.127e-01) * a00;
	acc0 += M4(2.241e-02, -6.385e-02, -1.720e-01, -1.801e-02, -2.928e-03, -2.800e-02, -5.050e-02, 2.295e-02, 9.457e-03, -7.361e-02, -2.876e-02, -1.586e-01, 1.250e-02, -3.825e-03, 1.512e-01, 5.916e-02) * a01;
	acc1 += M4(-5.845e-02, -1.153e-01, 9.396e-02, -7.471e-02, -2.363e-02, -5.633e-02, 8.595e-03, 1.534e-03, -1.156e-02, -4.734e-02, -1.073e-01, -4.199e-02, 4.480e-02, 9.541e-02, -3.125e-02, 3.180e-02) * a01;
	acc0 += M4(-2.082e-02, -1.016e-02, 1.812e-02, -2.512e-01, 1.714e-03, -1.188e-02, 2.312e-02, -4.969e-02, 1.009e-02, -5.921e-02, 3.138e-02, 1.382e-01, -8.651e-03, 4.460e-03, -3.529e-02, 2.305e-03) * a02;
	acc1 += M4(-2.955e-02, -4.575e-02, -5.702e-02, -6.456e-02, 1.891e-02, 8.329e-03, -1.152e-02, -1.725e-02, -3.204e-02, -5.366e-02, 3.139e-02, 2.803e-02, -5.345e-02, -2.917e-02, 4.308e-02, 1.417e-02) * a02;
	acc0 += M4(9.126e-03, -4.120e-02, -1.013e-02, -5.310e-02, 1.366e-02, -2.964e-02, 5.143e-02, -3.568e-02, -3.387e-02, 1.988e-02, -9.974e-02, -4.701e-02, 8.866e-03, 9.222e-02, -1.395e-01, -1.836e-01) * a10;
	acc1 += M4(1.880e-02, -2.177e-02, -5.575e-02, -9.597e-02, 6.408e-02, 4.381e-02, -7.885e-03, 1.561e-02, -9.348e-02, -2.549e-01, -5.822e-02, 2.057e-02, -1.165e-01, -3.967e-02, -4.344e-02, -2.998e-02) * a10;
	acc0 += M4(-1.440e-01, -3.040e-02, -3.236e-01, -1.499e-01, -4.102e-02, 2.647e-01, -1.509e-01, 6.068e-02, -3.959e-02, 3.697e-02, -2.157e-01, -1.821e-01, 2.023e-01, 1.480e-02, 6.554e-01, 5.209e-01) * a11;
	acc1 += M4(-3.724e-01, -2.549e-01, -3.003e-01, -1.078e-01, -1.602e-01, -1.646e-01, 3.758e-02, 3.927e-03, -1.677e-01, 2.715e-01, -5.077e-01, -4.595e-01, 5.335e-01, 2.163e-01, 2.900e-01, 1.685e-01) * a11;
	acc0 += M4(8.183e-03, -4.023e-04, -1.879e-01, -2.264e-01, 4.220e-03, -3.498e-02, -1.331e-02, -2.288e-01, -9.394e-02, -3.280e-02, -2.920e-01, 1.199e-01, -3.059e-02, 2.659e-02, -1.178e-01, 7.759e-02) * a12;
	acc1 += M4(-6.102e-02, -1.143e-02, -1.993e-01, -2.582e-01, 3.449e-02, -5.914e-02, -9.840e-02, -6.662e-02, -1.440e-01, -8.385e-02, 1.275e-01, 1.194e-01, -7.479e-02, -2.426e-02, 6.519e-02, 3.552e-02) * a12;
	acc0 += M4(1.130e-02, 1.139e-03, -5.263e-02, -1.117e-02, 8.966e-03, -3.280e-02, -8.356e-03, -1.395e-02, -3.973e-02, -1.654e-02, -4.564e-02, -1.649e-02, -7.006e-02, -6.662e-03, 1.284e-01, 7.937e-02) * a20;
	acc1 += M4(-1.801e-02, -8.418e-02, 1.355e-03, -5.065e-02, -2.441e-02, -7.882e-03, 1.015e-02, -2.830e-02, -5.104e-02, 2.520e-03, -8.132e-02, 5.818e-04, 3.890e-02, -8.597e-03, 2.645e-03, 8.467e-02) * a20;
	acc0 += M4(-9.676e-02, 8.400e-03, -1.191e-01, -6.689e-02, -7.737e-02, -4.395e-02, 1.928e-01, 5.486e-02, -6.804e-02, 2.863e-03, -1.160e-01, -2.422e-02, 3.778e-01, -1.599e-03, 1.075e-01, -2.535e-02) * a21;
	acc1 += M4(-2.156e-02, -5.837e-02, -1.528e-01, -1.558e-01, 1.443e-01, 4.070e-02, 7.379e-02, 2.173e-01, -2.083e-01, -4.933e-02, -4.823e-02, -7.528e-02, 1.158e-01, 7.167e-02, 9.007e-02, 8.758e-02) * a21;
	acc0 += M4(-2.193e-01, -1.920e-02, -1.048e-01, -7.545e-03, 1.606e-02, -1.691e-02, 5.414e-01, 1.433e-01, -8.564e-03, -9.462e-03, 1.249e-01, -2.499e-02, 3.177e-02, -2.087e-02, -6.915e-02, -1.002e-02) * a22;
	acc1 += M4(-2.017e-01, -5.139e-02, -1.830e-01, -1.392e-01, 5.553e-01, 3.374e-02, 1.725e-01, 5.684e-01, -1.743e-01, -5.944e-02, -5.204e-02, 8.925e-03, 1.055e-02, 5.162e-03, 3.498e-02, 3.258e-02) * a22;
	acc0 += M4(3.678e-02, 1.629e-01, 6.595e-02, -1.861e-02, -2.099e-02, -3.428e-01, 9.540e-02, 4.846e-02, -2.490e-02, 3.321e-01, 4.199e-03, 6.656e-03, -8.738e-03, 1.489e-01, 2.686e-02, -6.443e-02) * b00;
	acc1 += M4(6.461e-02, 1.409e-01, -6.717e-02, 4.148e-02, -9.652e-02, 1.268e-01, 1.239e-02, -5.204e-04, -1.679e-02, -9.576e-02, -1.542e-01, -2.646e-02, 3.772e-02, 6.487e-02, -2.121e-01, -3.056e-02) * b00;
	acc0 += M4(-2.440e-02, 4.577e-03, -1.810e-01, 2.439e-02, 3.700e-03, 5.432e-02, 1.037e-01, 1.945e-01, 1.665e-02, -9.334e-03, 5.857e-02, -2.285e-02, -4.424e-03, 4.373e-02, -4.001e-02, 2.017e-01) * b01;
	acc1 += M4(-2.447e-02, -1.029e-01, -1.252e-02, -5.774e-02, -3.855e-01, 8.815e-02, -1.674e-01, 3.661e-02, -3.911e-02, 2.363e-02, -2.201e-01, -3.257e-03, -1.766e-02, 1.194e-01, 2.847e-01, 1.122e-01) * b01;
	acc0 += M4(-1.791e-02, -2.829e-02, 1.540e-03, -2.984e-02, 1.677e-02, -4.450e-03, -2.620e-02, 2.042e-02, 1.048e-02, -3.981e-03, 4.460e-02, 2.416e-02, -4.218e-02, 3.384e-02, -1.567e-01, -3.369e-01) * b02;
	acc1 += M4(4.857e-02, 1.580e-03, -4.158e-02, -3.985e-03, -2.953e-03, -3.022e-02, -4.629e-02, -1.547e-02, -1.861e-02, 1.650e-02, 3.735e-03, 2.089e-02, 4.628e-02, -1.447e-02, -1.527e-02, 9.855e-04) * b02;
	acc0 += M4(2.157e-03, -1.636e-01, 3.494e-02, -2.480e-01, -2.364e-03, 1.866e-02, 2.316e-02, 4.932e-02, -3.587e-02, -2.583e-01, 9.245e-02, 6.644e-02, 6.054e-03, -2.759e-04, 2.959e-02, -1.896e-02) * b10;
	acc1 += M4(1.819e-02, -1.177e-02, -1.135e-01, -2.049e-02, 3.982e-02, -1.596e-01, -8.678e-02, -5.706e-02, 9.119e-02, 6.275e-02, 7.862e-02, 2.759e-02, 2.108e-02, 2.818e-02, 9.469e-03, -4.247e-02) * b10;
	acc0 += M4(-1.350e-01, -2.472e-02, -8.122e-01, 1.313e-01, 6.739e-02, 4.771e-02, -8.613e-01, -3.199e-01, 7.574e-02, -2.056e-01, 3.135e-01, -2.740e-02, -1.285e-03, -4.899e-02, 2.347e-01, 1.509e-01) * b11;
	acc1 += M4(-8.007e-01, -2.986e-01, -8.228e-01, -3.253e-01, -3.417e-02, 8.006e-02, -3.084e-01, -6.859e-02, 4.424e-01, 1.803e-01, 5.795e-01, 2.254e-01, 1.462e-01, 2.214e-01, 4.090e-01, 4.902e-01) * b11;
	acc0 += M4(5.167e-02, -8.877e-03, -1.054e-01, -1.766e-01, -5.628e-02, 8.474e-03, -1.577e-01, 4.283e-02, -4.050e-02, 4.060e-02, -5.912e-02, 9.550e-02, -6.229e-03, -3.450e-02, -1.097e-01, 8.152e-02) * b12;
	acc1 += M4(-7.734e-03, 1.437e-02, -1.890e-01, -1.199e-01, -8.768e-02, -2.773e-02, -3.185e-02, 1.835e-03, 2.713e-02, 2.265e-02, 1.229e-01, 6.237e-02, -8.249e-01, -5.308e-02, -2.060e-01, -2.290e-01) * b12;
	acc0 += M4(3.099e-02, 4.629e-03, -7.357e-02, -2.444e-02, -1.570e-02, -1.352e-02, -2.234e-02, 6.757e-03, 4.977e-02, 2.993e-03, -2.964e-02, 7.493e-02, 6.430e-03, 2.338e-02, 2.451e-02, -8.187e-06) * b20;
	acc1 += M4(3.510e-02, 2.769e-02, -6.170e-02, -1.790e-01, -4.872e-04, -1.897e-02, -1.637e-03, -1.837e-02, -2.140e-02, 1.249e-01, 2.238e-01, 8.319e-02, 1.317e-02, 1.995e-02, 4.378e-02, -2.783e-02) * b20;
	acc0 += M4(8.821e-02, -2.253e-04, -1.372e-01, -9.655e-02, -3.242e-02, 8.381e-03, -4.445e-02, -3.832e-02, 4.310e-01, -2.069e-02, 1.122e-01, 2.599e-02, 8.130e-02, 1.273e-02, 3.906e-02, 1.117e-02) * b21;
	acc1 += M4(2.618e-02, -6.120e-02, -5.526e-02, -2.691e-02, -1.167e-01, -5.927e-03, -5.483e-02, -2.290e-01, 2.741e-02, -3.821e-02, -8.966e-02, -6.514e-04, 7.408e-02, 6.070e-03, 7.055e-02, 2.638e-02) * b21;
	acc0 += M4(-5.564e-02, 8.275e-03, -1.941e-01, 6.913e-03, -9.782e-03, -6.857e-04, -1.345e-02, 2.694e-02, 4.135e-02, 5.690e-03, 1.075e-02, -5.023e-02, -2.679e-02, 1.055e-02, -4.965e-02, -8.235e-03) * b22;
	acc1 += M4(5.444e-02, -2.594e-03, 2.130e-02, 1.360e-02, -3.800e-03, 9.712e-03, 5.211e-02, 1.497e-02, 2.305e-03, 1.330e-03, 2.908e-02, -3.352e-02, 8.081e-02, 1.196e-02, 7.949e-02, 1.716e-02) * b22;
	// Clamp to [0, 1] and write the left / right texel of the output pair.
	acc0 = clamp(acc0, V4(0.0), V4(1.0));
	imageStore(out_image, dst, vec4(acc0));
	acc1 = clamp(acc1, V4(0.0), V4(1.0));
	imageStore(out_image, dst + ivec2(1, 0), vec4(acc1));
}

//!DESC [CuNNy_faster_SOFT] -out-shuffle
//!HOOK LUMA
//!COMPUTE 16 16 8 8
//!BIND conv2
//!BIND LUMA
//!WIDTH LUMA.w 2 *
//!HEIGHT LUMA.h 2 *
//!COMPONENTS 1
//!WHEN OUTPUT.w LUMA.w 1.200 * > OUTPUT.h LUMA.h 1.200 * > *
// Pass "out-shuffle": final layer. Reads "conv2" and LUMA and produces a
// single-component image twice as wide and twice as tall as LUMA; hook()
// below writes a 2x2 block of output pixels per LUMA pixel.
// NOTE(review): there is no SAVE directive here, so presumably the result
// replaces the hooked LUMA plane - confirm against the mpv documentation.
// The WHEN expression is in reverse Polish notation:
// (OUTPUT.w > LUMA.w * 1.2) * (OUTPUT.h > LUMA.h * 1.2).

// Request 16-bit float types. When the extension macro is not defined the
// V4 / M4 / F aliases fall back to the 32-bit types instead.
#extension GL_EXT_shader_explicit_arithmetic_types_float16 : enable
#ifdef GL_EXT_shader_explicit_arithmetic_types_float16
#	define V4 f16vec4
#	define M4 f16mat4
#	define F float16_t
#else
#	define V4 vec4
#	define M4 mat4
#	define F float
#endif
// l0 / l1: fetch the two texels of "conv2" that belong to the pixel at
// pos + (x, y) (clamped to [0, sz]): texel column 2 * px for l0 and
// 2 * px + 1 for l1, each scaled by conv2_mul. Both macros expand the names
// `pos` and `sz`, which must be in scope at the point of use.
#define l0(x, y) V4((conv2_mul * texelFetch(conv2_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(2, 1) + ivec2(0, 0), 0)))
#define l1(x, y) V4((conv2_mul * texelFetch(conv2_raw, clamp(pos + ivec2(x, y), ivec2(0), sz) * ivec2(2, 1) + ivec2(1, 0), 0)))
// Workgroup tile: index 0 holds the l0 texels and index 1 the l1 texels of a
// 10x10 pixel region (8x8 block plus a one-pixel border for the 3x3 taps).
shared V4 G[2][10][10];
// Pass "out-shuffle" entry point: a final 3x3 convolution yields four values
// per source pixel; each is added to a LUMA sample taken at the matching
// output-pixel centre and written to one pixel of a 2x2 output block.
void hook() {
	ivec2 lid = ivec2(gl_LocalInvocationID.xy);
	// `pos` and `sz` keep their names because l0()/l1() expand them.
	ivec2 pos = ivec2(gl_WorkGroupID.xy) * ivec2(8, 8) + lid;
	ivec2 sz = ivec2(LUMA_size) - ivec2(1);
	ivec2 dst = pos * ivec2(2, 2);
	// Cooperative load of the 10x10 tile (8x8 block plus one-pixel border).
	for (int dy = 0; dy < 10; dy += 8) {
		int row = lid.y + dy;
		if (row < 10) {
			for (int dx = 0; dx < 10; dx += 8) {
				int col = lid.x + dx;
				if (col < 10) {
					G[0][row][col] = l0(dx - 1, dy - 1);
					G[1][row][col] = l1(dx - 1, dy - 1);
				}
			}
		}
	}
	barrier();
	// 3x3 neighbourhood from the first input texel, named a<row><col>.
	V4 a00 = G[0][lid.y][lid.x];
	V4 a01 = G[0][lid.y][lid.x + 1];
	V4 a02 = G[0][lid.y][lid.x + 2];
	V4 a10 = G[0][lid.y + 1][lid.x];
	V4 a11 = G[0][lid.y + 1][lid.x + 1];
	V4 a12 = G[0][lid.y + 1][lid.x + 2];
	V4 a20 = G[0][lid.y + 2][lid.x];
	V4 a21 = G[0][lid.y + 2][lid.x + 1];
	V4 a22 = G[0][lid.y + 2][lid.x + 2];
	// 3x3 neighbourhood from the second input texel, named b<row><col>.
	V4 b00 = G[1][lid.y][lid.x];
	V4 b01 = G[1][lid.y][lid.x + 1];
	V4 b02 = G[1][lid.y][lid.x + 2];
	V4 b10 = G[1][lid.y + 1][lid.x];
	V4 b11 = G[1][lid.y + 1][lid.x + 1];
	V4 b12 = G[1][lid.y + 1][lid.x + 2];
	V4 b20 = G[1][lid.y + 2][lid.x];
	V4 b21 = G[1][lid.y + 2][lid.x + 1];
	V4 b22 = G[1][lid.y + 2][lid.x + 2];
	V4 acc = V4(0.0);
	// Per-tap weight matrices; the accumulation order is kept as generated.
	acc += M4(2.510e-01, -2.566e-02, 1.076e-01, 9.371e-03, -1.089e-07, 4.047e-07, -2.185e-07, -3.217e-07, -3.883e-03, -1.500e-03, 6.819e-04, -6.391e-04, 8.959e-03, -6.730e-03, -6.848e-03, -2.396e-03) * a00;
	acc += M4(-3.139e-01, 3.573e-01, -7.421e-02, -2.367e-01, -1.290e-07, 8.812e-08, -8.623e-07, 5.537e-07, 1.067e-02, 8.927e-03, -6.301e-04, -4.654e-03, 3.959e-03, 4.633e-02, 7.129e-03, 3.835e-03) * a01;
	acc += M4(-1.325e-02, -1.538e-01, -7.366e-03, 6.785e-04, 1.031e-07, -3.190e-07, 1.196e-06, -1.594e-07, 3.005e-03, 4.086e-03, 3.224e-03, -1.480e-02, 3.055e-04, 1.465e-03, -6.149e-04, -4.676e-03) * a02;
	acc += M4(-8.878e-03, -9.821e-04, 1.096e-01, -3.794e-03, 3.351e-07, -2.068e-03, 1.239e-06, -1.966e-04, 6.028e-02, 3.110e-04, 1.110e-01, 3.410e-03, 1.037e-01, 2.405e-02, 1.284e-01, 6.533e-04) * a10;
	acc += M4(1.207e-02, -1.241e-02, 1.945e-01, 3.271e-01, -1.295e-06, -9.311e-03, 1.318e-04, 1.581e-04, -4.893e-01, 9.114e-02, 1.602e-01, 5.961e-01, 2.548e-01, -6.932e-01, 2.209e-02, -4.812e-02) * a11;
	acc += M4(-4.439e-03, 8.852e-03, 1.795e-02, -2.674e-03, 1.064e-06, 1.140e-02, -1.323e-04, 3.804e-05, 8.667e-03, -2.150e-01, -2.854e-02, -6.806e-02, 1.164e-02, 1.255e-01, 4.803e-03, 1.019e-02) * a12;
	acc += M4(-2.768e-05, 1.131e-04, -3.375e-04, 2.552e-07, 4.732e-03, -3.805e-04, 1.789e-03, 1.529e-03, 1.551e-03, 6.016e-05, -3.849e-02, -6.007e-03, -3.805e-03, 2.920e-03, 3.716e-02, -2.673e-03) * a20;
	acc += M4(1.358e-04, -1.404e-04, -6.103e-03, -4.832e-04, -3.505e-01, 2.370e-02, -1.042e-01, -1.921e-02, 6.589e-03, 4.104e-03, -9.547e-02, -8.447e-02, -3.214e-03, 2.240e-02, -9.908e-02, -3.365e-01) * a21;
	acc += M4(-3.990e-06, -3.952e-06, 2.283e-03, -1.580e-03, 1.745e-02, -6.855e-01, 7.016e-02, 3.044e-01, 2.304e-03, -6.626e-04, 2.136e-02, -3.143e-02, 2.451e-03, -4.946e-03, -1.381e-02, 7.494e-03) * a22;
	acc += M4(2.101e-02, 8.086e-06, -1.418e-02, -1.513e-03, 2.097e-03, 1.218e-03, -8.258e-03, -7.411e-04, -6.468e-02, 2.912e-03, -3.844e-03, -2.964e-03, 4.454e-02, 2.831e-03, 2.538e-03, 2.714e-03) * b00;
	acc += M4(-4.562e-02, 5.503e-02, 2.661e-02, 1.529e-03, -6.190e-02, 2.581e-02, 1.711e-02, 1.976e-03, -1.064e-01, -2.280e-01, 2.075e-03, 1.422e-02, 2.725e-01, 1.950e-01, -1.562e-02, 4.893e-03) * b01;
	acc += M4(5.021e-03, -1.778e-02, -3.090e-04, -1.526e-02, -1.413e-02, 6.117e-03, -9.800e-03, -1.895e-02, -7.460e-03, -3.119e-02, -5.146e-03, 9.612e-03, -4.132e-03, 7.254e-02, 2.927e-03, 6.278e-03) * b02;
	acc += M4(3.689e-02, 1.056e-02, 4.516e-02, -1.052e-02, 8.769e-03, 1.687e-03, 7.560e-03, 6.384e-04, -7.053e-02, 5.967e-03, -9.399e-02, 1.409e-02, -4.725e-02, -2.427e-02, -1.140e-01, -1.150e-02) * b10;
	acc += M4(2.391e-01, 2.252e-01, -8.145e-01, -1.859e-02, -3.330e-01, -1.535e-02, -3.662e-01, 1.404e-03, 5.684e-01, 6.219e-02, 7.237e-02, -4.071e-01, -1.978e-01, -1.825e-01, 5.882e-01, -1.812e-01) * b11;
	acc += M4(-1.739e-02, -3.925e-02, 1.183e-02, -2.045e-01, 1.211e-02, 4.067e-01, 3.819e-02, 3.136e-01, -5.208e-03, 1.247e-01, 2.798e-02, 4.583e-02, 3.473e-03, 2.618e-02, -2.886e-02, 1.554e-01) * b12;
	acc += M4(6.890e-04, -5.187e-04, 2.202e-02, 1.943e-03, -3.007e-03, -1.668e-03, 5.449e-03, 9.238e-05, -5.150e-03, 2.220e-03, -3.884e-03, 4.636e-03, -8.638e-04, 9.015e-05, -2.889e-02, -5.227e-03) * b20;
	acc += M4(1.531e-03, 4.723e-03, 6.714e-02, 3.290e-02, 4.806e-03, 7.509e-03, -9.137e-02, 1.244e-02, -5.932e-03, -8.205e-03, 9.082e-02, 4.309e-02, 8.945e-04, -8.047e-03, -3.722e-02, -7.220e-03) * b21;
	acc += M4(-1.170e-03, 7.123e-03, -1.159e-02, -1.777e-02, -2.442e-04, -1.921e-02, -2.884e-02, 7.229e-02, 1.425e-03, -1.869e-03, -2.775e-03, 2.045e-02, -9.966e-04, -2.167e-03, -3.166e-03, 8.534e-03) * b22;
	// Constant term.
	acc += V4(1.471e-09, -2.972e-10, -1.438e-08, -1.472e-08);
	// Half a LUMA texel in normalised coordinates equals one output pixel,
	// so `uv` is the normalised centre of the top-left output pixel.
	vec2 half_pt = 0.5 * LUMA_pt;
	vec2 uv = (vec2(dst) + vec2(0.5)) * half_pt;
	// Sample LUMA at the centre of each of the four output pixels.
	float y00 = LUMA_tex(uv + vec2(0.0, 0.0) * half_pt).r;
	float y10 = LUMA_tex(uv + vec2(1.0, 0.0) * half_pt).r;
	float y01 = LUMA_tex(uv + vec2(0.0, 1.0) * half_pt).r;
	float y11 = LUMA_tex(uv + vec2(1.0, 1.0) * half_pt).r;
	// x, y, z, w map to the (0,0), (1,0), (0,1), (1,1) pixels of the block.
	imageStore(out_image, dst + ivec2(0, 0), vec4(acc.x + y00, 0.0, 0.0, 1.0));
	imageStore(out_image, dst + ivec2(1, 0), vec4(acc.y + y10, 0.0, 0.0, 1.0));
	imageStore(out_image, dst + ivec2(0, 1), vec4(acc.z + y01, 0.0, 0.0, 1.0));
	imageStore(out_image, dst + ivec2(1, 1), vec4(acc.w + y11, 0.0, 0.0, 1.0));
}
